## Description

DDSAnalytics is an analytics company that specializes in talent management solutions for Fortune 1000 companies. Talent management is defined as the iterative process of developing and retaining employees. It may include:

- workforce planning
- employee training programs
- identifying high-potential employees
- reducing/preventing voluntary employee turnover (attrition)

To gain a competitive edge over its competition, DDSAnalytics is planning to leverage data science for talent management. The executive leadership has identified predicting employee turnover as its first application of data science for talent management. Before the business green-lights the project, they have tasked your data science team with conducting an analysis of existing employee data.
# Read in data
# CSD: the labelled case-study data, read straight from the course GitHub
# repository. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
CSD <- read.csv(
  "https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%2014%20and%2015%20Case%20Study%202/CaseStudy2-data.csv",
  header = TRUE
)

# NoAttr / Salary: the "No Attrition" and "No Salary" comp-set files, read
# from a local folder.
# NOTE(review): this absolute path only exists on the original author's
# machine; point `case_study_dir` at your own copy to run the script.
case_study_dir <- "/Users/lnelson/Box/MSDS/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2"
NoAttr <- read.csv(file.path(case_study_dir, "CaseStudy2CompSet No Attrition.csv"))
Salary <- read_excel(file.path(case_study_dir, "CaseStudy2CompSet No Salary.xlsx"))

glimpse(CSD) # Show Data Structure
## Rows: 870
## Columns: 36
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ Age <int> 32, 40, 35, 32, 24, 27, 41, 37, 34, 34, 43, 2…
## $ Attrition <chr> "No", "No", "No", "No", "No", "No", "No", "No…
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Rarely", "Travel_Fre…
## $ DailyRate <int> 117, 1308, 200, 801, 567, 294, 1283, 309, 133…
## $ Department <chr> "Sales", "Research & Development", "Research …
## $ DistanceFromHome <int> 13, 14, 18, 1, 2, 10, 5, 10, 10, 10, 6, 1, 7,…
## $ Education <int> 4, 3, 2, 4, 1, 2, 5, 4, 4, 4, 3, 2, 3, 1, 2, …
## $ EducationField <chr> "Life Sciences", "Medical", "Life Sciences", …
## $ EmployeeCount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <int> 859, 1128, 1412, 2016, 1646, 733, 1448, 1105,…
## $ EnvironmentSatisfaction <int> 2, 3, 3, 3, 1, 4, 2, 4, 3, 4, 1, 3, 3, 3, 4, …
## $ Gender <chr> "Male", "Male", "Male", "Female", "Female", "…
## $ HourlyRate <int> 73, 44, 60, 48, 32, 32, 90, 88, 87, 92, 81, 4…
## $ JobInvolvement <int> 3, 2, 3, 3, 3, 3, 4, 2, 3, 2, 2, 3, 3, 3, 3, …
## $ JobLevel <int> 2, 5, 3, 3, 1, 3, 1, 2, 1, 2, 5, 1, 3, 1, 1, …
## $ JobRole <chr> "Sales Executive", "Research Director", "Manu…
## $ JobSatisfaction <int> 4, 3, 4, 4, 4, 1, 3, 4, 3, 3, 3, 4, 3, 2, 1, …
## $ MaritalStatus <chr> "Divorced", "Single", "Single", "Married", "S…
## $ MonthlyIncome <int> 4403, 19626, 9362, 10422, 3760, 8793, 2127, 6…
## $ MonthlyRate <int> 9250, 17544, 19944, 24032, 17218, 4809, 5561,…
## $ NumCompaniesWorked <int> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 7, 1, 3, 1, 6, …
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ OverTime <chr> "No", "No", "No", "No", "Yes", "No", "Yes", "…
## $ PercentSalaryHike <int> 11, 14, 11, 19, 13, 21, 12, 14, 19, 14, 13, 1…
## $ PerformanceRating <int> 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, …
## $ RelationshipSatisfaction <int> 3, 1, 3, 3, 3, 3, 1, 3, 4, 2, 4, 2, 2, 1, 3, …
## $ StandardHours <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <int> 1, 0, 0, 2, 0, 2, 0, 3, 1, 1, 0, 1, 0, 1, 0, …
## $ TotalWorkingYears <int> 8, 21, 10, 14, 6, 9, 7, 8, 1, 8, 21, 3, 17, 1…
## $ TrainingTimesLastYear <int> 3, 2, 2, 3, 2, 4, 5, 5, 2, 3, 2, 2, 3, 3, 3, …
## $ WorkLifeBalance <int> 2, 4, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, …
## $ YearsAtCompany <int> 5, 20, 2, 14, 6, 9, 4, 1, 1, 8, 16, 3, 8, 1, …
## $ YearsInCurrentRole <int> 2, 7, 2, 10, 3, 7, 2, 0, 1, 2, 12, 2, 5, 0, 6…
## $ YearsSinceLastPromotion <int> 0, 4, 2, 5, 1, 1, 0, 0, 0, 7, 6, 2, 1, 0, 5, …
## $ YearsWithCurrManager <int> 3, 9, 2, 7, 3, 7, 3, 0, 0, 7, 14, 2, 6, 0, 7,…
# Convert the categorical (character) columns to factors.
# The columns are listed by name so this step does not depend on column order.
categorical_cols <- c(
  "Attrition", "BusinessTravel", "Department", "EducationField", "Gender",
  "JobRole", "MaritalStatus", "Over18", "OverTime"
)
# Positions of those columns in CSD (3, 4, 6, 9, 13, 17, 19, 23, 24 in the raw
# data). The variable keeps its original name `names` (same identifier as
# base::names()) only so that any later reference to it keeps working.
names <- match(categorical_cols, colnames(CSD))
CSD[categorical_cols] <- lapply(CSD[categorical_cols], factor) # chr -> factor
glimpse(CSD) # Show Data Structure
## Rows: 870
## Columns: 36
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ Age <int> 32, 40, 35, 32, 24, 27, 41, 37, 34, 34, 43, 2…
## $ Attrition <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ BusinessTravel <fct> Travel_Rarely, Travel_Rarely, Travel_Frequent…
## $ DailyRate <int> 117, 1308, 200, 801, 567, 294, 1283, 309, 133…
## $ Department <fct> Sales, Research & Development, Research & Dev…
## $ DistanceFromHome <int> 13, 14, 18, 1, 2, 10, 5, 10, 10, 10, 6, 1, 7,…
## $ Education <int> 4, 3, 2, 4, 1, 2, 5, 4, 4, 4, 3, 2, 3, 1, 2, …
## $ EducationField <fct> Life Sciences, Medical, Life Sciences, Market…
## $ EmployeeCount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EmployeeNumber <int> 859, 1128, 1412, 2016, 1646, 733, 1448, 1105,…
## $ EnvironmentSatisfaction <int> 2, 3, 3, 3, 1, 4, 2, 4, 3, 4, 1, 3, 3, 3, 4, …
## $ Gender <fct> Male, Male, Male, Female, Female, Male, Male,…
## $ HourlyRate <int> 73, 44, 60, 48, 32, 32, 90, 88, 87, 92, 81, 4…
## $ JobInvolvement <int> 3, 2, 3, 3, 3, 3, 4, 2, 3, 2, 2, 3, 3, 3, 3, …
## $ JobLevel <int> 2, 5, 3, 3, 1, 3, 1, 2, 1, 2, 5, 1, 3, 1, 1, …
## $ JobRole <fct> Sales Executive, Research Director, Manufactu…
## $ JobSatisfaction <int> 4, 3, 4, 4, 4, 1, 3, 4, 3, 3, 3, 4, 3, 2, 1, …
## $ MaritalStatus <fct> Divorced, Single, Single, Married, Single, Di…
## $ MonthlyIncome <int> 4403, 19626, 9362, 10422, 3760, 8793, 2127, 6…
## $ MonthlyRate <int> 9250, 17544, 19944, 24032, 17218, 4809, 5561,…
## $ NumCompaniesWorked <int> 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 7, 1, 3, 1, 6, …
## $ Over18 <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ OverTime <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, N…
## $ PercentSalaryHike <int> 11, 14, 11, 19, 13, 21, 12, 14, 19, 14, 13, 1…
## $ PerformanceRating <int> 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3, …
## $ RelationshipSatisfaction <int> 3, 1, 3, 3, 3, 3, 1, 3, 4, 2, 4, 2, 2, 1, 3, …
## $ StandardHours <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8…
## $ StockOptionLevel <int> 1, 0, 0, 2, 0, 2, 0, 3, 1, 1, 0, 1, 0, 1, 0, …
## $ TotalWorkingYears <int> 8, 21, 10, 14, 6, 9, 7, 8, 1, 8, 21, 3, 17, 1…
## $ TrainingTimesLastYear <int> 3, 2, 2, 3, 2, 4, 5, 5, 2, 3, 2, 2, 3, 3, 3, …
## $ WorkLifeBalance <int> 2, 4, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, …
## $ YearsAtCompany <int> 5, 20, 2, 14, 6, 9, 4, 1, 1, 8, 16, 3, 8, 1, …
## $ YearsInCurrentRole <int> 2, 7, 2, 10, 3, 7, 2, 0, 1, 2, 12, 2, 5, 0, 6…
## $ YearsSinceLastPromotion <int> 0, 4, 2, 5, 1, 1, 0, 0, 0, 7, 6, 2, 1, 0, 5, …
## $ YearsWithCurrManager <int> 3, 9, 2, 7, 3, 7, 3, 0, 0, 7, 14, 2, 6, 0, 7,…
# Recode the two Yes/No factors as numeric 0/1 indicators (1 = Yes, 0 = No).
# The comparison yields TRUE/FALSE and as.numeric() maps that to 1/0 directly,
# so no second as.numeric(as.character()) pass is needed.
CSD$Attrition <- as.numeric(CSD$Attrition == "Yes")
CSD$OverTime <- as.numeric(CSD$OverTime == "Yes")
# Class of every column after the recoding
sapply(CSD, class)
## ID Age Attrition
## "integer" "integer" "numeric"
## BusinessTravel DailyRate Department
## "factor" "integer" "factor"
## DistanceFromHome Education EducationField
## "integer" "integer" "factor"
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## "integer" "integer" "integer"
## Gender HourlyRate JobInvolvement
## "factor" "integer" "integer"
## JobLevel JobRole JobSatisfaction
## "integer" "factor" "integer"
## MaritalStatus MonthlyIncome MonthlyRate
## "factor" "integer" "integer"
## NumCompaniesWorked Over18 OverTime
## "integer" "factor" "numeric"
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## "integer" "integer" "integer"
## StandardHours StockOptionLevel TotalWorkingYears
## "integer" "integer" "integer"
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## "integer" "integer" "integer"
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## "integer" "integer" "integer"
# Data prep and cleaning
# Count the missing values in every column of CSD
vapply(CSD, function(column) sum(is.na(column)), integer(1))
## ID Age Attrition
## 0 0 0
## BusinessTravel DailyRate Department
## 0 0 0
## DistanceFromHome Education EducationField
## 0 0 0
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked Over18 OverTime
## 0 0 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0 0 0
## StandardHours StockOptionLevel TotalWorkingYears
## 0 0 0
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 0 0 0
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 0 0 0
# Drop ID, EmployeeCount, EmployeeNumber, Over18, StandardHours -- no value for EDA.
# Dropping by name keeps the remaining columns in their original order, does
# not depend on column positions, and is harmless if the line is run twice.
drop_cols <- c("ID", "EmployeeCount", "EmployeeNumber", "Over18", "StandardHours")
CSD <- CSD[, !(colnames(CSD) %in% drop_cols), drop = FALSE]
# Class of every remaining column
sapply(CSD, class)
## Age Attrition BusinessTravel
## "integer" "numeric" "factor"
## DailyRate Department DistanceFromHome
## "integer" "factor" "integer"
## Education EducationField EnvironmentSatisfaction
## "integer" "factor" "integer"
## Gender HourlyRate JobInvolvement
## "factor" "integer" "integer"
## JobLevel JobRole JobSatisfaction
## "integer" "factor" "integer"
## MaritalStatus MonthlyIncome MonthlyRate
## "factor" "integer" "integer"
## NumCompaniesWorked OverTime PercentSalaryHike
## "integer" "numeric" "integer"
## PerformanceRating RelationshipSatisfaction StockOptionLevel
## "integer" "integer" "integer"
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## "integer" "integer" "integer"
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## "integer" "integer" "integer"
## YearsWithCurrManager
## "integer"
# NOTE(review): the three variable names below look like a leftover note
# (presumably candidate predictors) -- confirm or remove.
# Distance from Home
# EnvironmentSatisfaction
# PerformanceRating
# Exploratory Data Analysis
# Structure of the data: type and first values of every column
str(CSD)
## 'data.frame': 870 obs. of 31 variables:
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ OverTime : num 0 0 0 0 1 0 1 1 1 0 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
# Dimensions of the data (rows, columns)
dim(CSD)
## [1] 870 31
# High-level view of the continuous variables: histograms, then densities
plot_histogram(
  CSD,
  binary_as_factor = TRUE,
  geom_histogram_args = list(bins = 30L),
  scale_x = "continuous"
)
plot_density(CSD)
# Categorical variables: bar plots
plot_bar(CSD)
# Interesting insights into the data
# Correlation matrix of the numeric variables, rounded to one decimal place.
# Columns are picked by type instead of by position. OverTime is left out
# because the original position list skipped it as well.
# NOTE(review): OverTime is numeric 0/1 at this point -- confirm whether it
# should be part of the matrix.
is_numeric_col <- vapply(CSD, is.numeric, logical(1))
cordata <- CSD[, setdiff(colnames(CSD)[is_numeric_col], "OverTime")]
corr <- round(cor(cordata), 1) # Create correlation matrix
#corr #Correlation Matrix
# Strongest pairs read off the rounded correlation matrix:
#   Job Level and Monthly Income                 = 1
#   Total Working Years and Job Level            = .8
#   Total Working Years and Monthly Income       = .8
#   Percent Salary Hike and Performance Rating   = .8
#   Age and Total Working Years                  = .7
# Total Working Years vs Job Level: scatter plot, Pearson r, then its test
plot(
  x = CSD$TotalWorkingYears,
  y = CSD$JobLevel,
  xlab = "Total Working Years",
  ylab = "Job Level",
  main = "Correlation between Total Working Years & Job Level"
)
cor(x = CSD$TotalWorkingYears, y = CSD$JobLevel)
## [1] 0.7807524
cor.test(CSD$TotalWorkingYears, CSD$JobLevel)
##
## Pearson's product-moment correlation
##
## data: CSD$TotalWorkingYears and CSD$JobLevel
## t = 36.813, df = 868, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7533821 0.8054221
## sample estimates:
## cor
## 0.7807524
# Total Working Years vs Monthly Income: scatter plot, Pearson r, then its test
plot(
  x = CSD$TotalWorkingYears,
  y = CSD$MonthlyIncome,
  xlab = "Total Working Years",
  ylab = "Monthly Income",
  main = "Correlation between Total Working Years & Monthly Income"
)
cor(x = CSD$TotalWorkingYears, y = CSD$MonthlyIncome)
## [1] 0.7785112
cor.test(CSD$TotalWorkingYears,CSD$MonthlyIncome)
##
## Pearson's product-moment correlation
##
## data: CSD$TotalWorkingYears and CSD$MonthlyIncome
## t = 36.544, df = 868, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7509003 0.8034053
## sample estimates:
## cor
## 0.7785112
# Percent Salary Hike vs Performance Rating: scatter plot, Pearson r, then its test
plot(
  x = CSD$PercentSalaryHike,
  y = CSD$PerformanceRating,
  xlab = "Percent Salary Hike",
  ylab = "Performance Rating",
  main = "Correlation between Percent Salary Hike & Performance Rating"
)
cor(x = CSD$PercentSalaryHike, y = CSD$PerformanceRating)
## [1] 0.7750532
cor.test(CSD$PercentSalaryHike,CSD$PerformanceRating)
##
## Pearson's product-moment correlation
##
## data: CSD$PercentSalaryHike and CSD$PerformanceRating
## t = 36.136, df = 868, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7470726 0.8002923
## sample estimates:
## cor
## 0.7750532
# Relationship between categorical variables
# Two-way frequency table and 100% stacked bars: Attrition by OverTime
BarChart(
  Attrition,
  by = OverTime,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by OverTime",
  fill = c("black", "gold3")
)
## >>> Suggestions
## Plot(Attrition, OverTime) # bubble plot
## BarChart(Attrition, by=OverTime, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## OverTime 0 1 Sum
## 0 558 60 618
## 1 172 80 252
## Sum 730 140 870
##
##
## Cramer's V (phi): 0.272
##
## Chi-square Test: Chisq = 64.383, df = 1, p-value = 0.000
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## OverTime 0 1
## 0 0.764 0.429
## 1 0.236 0.571
## Sum 1.000 1.000
# 2 Marital_Status-Single
# Cross-tabulate marital status (rows) against attrition (columns, 0/1)
two_wayMA <- table(CSD$MaritalStatus, CSD$Attrition)
print(two_wayMA)
##
## 0 1
## Divorced 179 12
## Married 352 58
## Single 199 70
# Each cell as a proportion of the grand total (no margin is given, so these
# are shares of all rows in the table, not per-status attrition rates).
prop.table(two_wayMA) # cell percentages
##
## 0 1
## Divorced 0.20574713 0.01379310
## Married 0.40459770 0.06666667
## Single 0.22873563 0.08045977
# More attrition among employees whose marital status is Single
BarChart(
  Attrition,
  by = MaritalStatus,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by Marital Status",
  fill = c("black", "gold3", "turquoise3")
)
## >>> Suggestions
## Plot(Attrition, MaritalStatus) # bubble plot
## BarChart(Attrition, by=MaritalStatus, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## MaritalStatus 0 1 Sum
## Divorced 179 12 191
## Married 352 58 410
## Single 199 70 269
## Sum 730 140 870
##
##
## Cramer's V: 0.199
##
## Chi-square Test: Chisq = 34.406, df = 2, p-value = 0.000
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## MaritalStatus 0 1
## Divorced 0.245 0.086
## Married 0.482 0.414
## Single 0.273 0.500
## Sum 1.000 1.000
# 3 Job Role Sales Representative
# Cross-tabulate job role (rows) against attrition (columns, 0/1)
two_wayJS <- table(CSD$JobRole, CSD$Attrition)
print(two_wayJS)
##
## 0 1
## Healthcare Representative 68 8
## Human Resources 21 6
## Laboratory Technician 123 30
## Manager 47 4
## Manufacturing Director 85 2
## Research Director 50 1
## Research Scientist 140 32
## Sales Executive 167 33
## Sales Representative 29 24
# Each cell as a proportion of the grand total (no margin is given, so these
# are shares of all rows in the table, not per-role attrition rates).
prop.table(two_wayJS) # cell percentages
##
## 0 1
## Healthcare Representative 0.078160920 0.009195402
## Human Resources 0.024137931 0.006896552
## Laboratory Technician 0.141379310 0.034482759
## Manager 0.054022989 0.004597701
## Manufacturing Director 0.097701149 0.002298851
## Research Director 0.057471264 0.001149425
## Research Scientist 0.160919540 0.036781609
## Sales Executive 0.191954023 0.037931034
## Sales Representative 0.033333333 0.027586207
# Attrition by Job Role, drawn as 100% stacked bars.
# The original call ended in "outer = TRUE, )"; the dangling comma passed an
# extra empty argument and has been removed.
BarChart(
  Attrition,
  data = CSD,
  by = JobRole,
  stack100 = TRUE,
  main = "Attrition by Job Role",
  outer = TRUE
)
## >>> Suggestions
## Plot(Attrition, JobRole) # bubble plot
## BarChart(Attrition, by=JobRole, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## JobRole 0 1 Sum
## Healthcare Representative 68 8 76
## Human Resources 21 6 27
## Laboratory Technician 123 30 153
## Manager 47 4 51
## Manufacturing Director 85 2 87
## Research Director 50 1 51
## Research Scientist 140 32 172
## Sales Executive 167 33 200
## Sales Representative 29 24 53
## Sum 730 140 870
##
##
## Cramer's V: 0.264
##
## Chi-square Test: Chisq = 60.543, df = 8, p-value = 0.000
## >>> Low cell expected frequencies, chi-squared approximation may not be accurate
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## JobRole 0 1
## Healthcare Representative 0.093 0.057
## Human Resources 0.029 0.043
## Laboratory Technician 0.168 0.214
## Manager 0.064 0.029
## Manufacturing Director 0.116 0.014
## Research Director 0.068 0.007
## Research Scientist 0.192 0.229
## Sales Executive 0.229 0.236
## Sales Representative 0.040 0.171
## Sum 1.000 1.000
# Overall distribution of Attrition (1 = Yes, 0 = No)
BarChart(Attrition, data = CSD)
## >>> Suggestions
## BarChart(Attrition, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="reds") # red bars of varying lightness
## PieChart(Attrition) # doughnut (ring) chart
## Plot(Attrition) # bubble plot
## Plot(Attrition, stat="count") # lollipop plot
##
##
## --- Attrition ---
##
##
## Missing Values of Attrition: 0
##
##
## 0 1 Total
## Frequencies: 730 140 870
## Proportions: 0.839 0.161 1.000
##
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 400.115, df = 1, p-value = 0.000
# Distribution of employees across job roles
BarChart(JobRole, data = CSD)
## >>> Suggestions
## BarChart(JobRole, horiz=TRUE) # horizontal bar chart
## BarChart(JobRole, fill="reds") # red bars of varying lightness
## PieChart(JobRole) # doughnut (ring) chart
## Plot(JobRole) # bubble plot
## Plot(JobRole, stat="count") # lollipop plot
##
##
## --- JobRole ---
##
##
## Missing Values of JobRole: 0
##
##
## JobRole Count Prop
## ---------------------------------------
## Healthcare Representative 76 0.087
## Human Resources 27 0.031
## Laboratory Technician 153 0.176
## Manager 51 0.059
## Manufacturing Director 87 0.100
## Research Director 51 0.059
## Research Scientist 172 0.198
## Sales Executive 200 0.230
## Sales Representative 53 0.061
## ---------------------------------------
## Total 870 1.000
##
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 320.462, df = 8, p-value = 0.000
# Job Role by Gender, drawn as 100% stacked bars
BarChart(
  JobRole,
  by = Gender,
  data = CSD,
  stack100 = TRUE,
  main = "Job Role by Gender",
  fill = c("pink3", "blue3")
)
## >>> Suggestions
## Plot(JobRole, Gender) # bubble plot
## BarChart(JobRole, by=Gender, horiz=TRUE) # horizontal bar chart
## BarChart(JobRole, fill="steelblue") # steelblue bars
##
##
##
## JobRole Gender Count
## Healthcare Representative Female 33
## Healthcare Representative Male 43
## Healthcare Representative Sum 76
## Human Resources Female 9
## Human Resources Male 18
## Human Resources Sum 27
## Laboratory Technician Female 51
## Laboratory Technician Male 102
## Laboratory Technician Sum 153
## Manager Female 25
## Manager Male 26
## Manager Sum 51
## Manufacturing Director Female 43
## Manufacturing Director Male 44
## Manufacturing Director Sum 87
## Research Director Female 23
## Research Director Male 28
## Research Director Sum 51
## Research Scientist Female 65
## Research Scientist Male 107
## Research Scientist Sum 172
## Sales Executive Female 79
## Sales Executive Male 121
## Sales Executive Sum 200
## Sales Representative Female 26
## Sales Representative Male 27
## Sales Representative Sum 53
## Sum Female 354
## Sum Male 516
## Sum Sum 870
##
##
## Cramer's V: 0.113
##
## Chi-square Test: Chisq = 11.153, df = 8, p-value = 0.193
##
##
##
## JobRole Gender Count
## Healthcare Representative Female 0.434
## Healthcare Representative Male 0.566
## Healthcare Representative Sum 1.000
## Human Resources Female 0.333
## Human Resources Male 0.667
## Human Resources Sum 1.000
## Laboratory Technician Female 0.333
## Laboratory Technician Male 0.667
## Laboratory Technician Sum 1.000
## Manager Female 0.490
## Manager Male 0.510
## Manager Sum 1.000
## Manufacturing Director Female 0.494
## Manufacturing Director Male 0.506
## Manufacturing Director Sum 1.000
## Research Director Female 0.451
## Research Director Male 0.549
## Research Director Sum 1.000
## Research Scientist Female 0.378
## Research Scientist Male 0.622
## Research Scientist Sum 1.000
## Sales Executive Female 0.395
## Sales Executive Male 0.605
## Sales Executive Sum 1.000
## Sales Representative Female 0.491
## Sales Representative Male 0.509
## Sales Representative Sum 1.000
# Distribution of Marital Status
BarChart(
  MaritalStatus,
  legend_labels = TRUE,
  data = CSD
)
## >>> Suggestions
## BarChart(MaritalStatus, horiz=TRUE) # horizontal bar chart
## BarChart(MaritalStatus, fill="reds") # red bars of varying lightness
## PieChart(MaritalStatus) # doughnut (ring) chart
## Plot(MaritalStatus) # bubble plot
## Plot(MaritalStatus, stat="count") # lollipop plot
##
##
## --- MaritalStatus ---
##
##
## Missing Values of MaritalStatus: 0
##
##
## Divorced Married Single Total
## Frequencies: 191 410 269 870
## Proportions: 0.220 0.471 0.309 1.000
##
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 84.972, df = 2, p-value = 0.000
# Marital Status distribution by Gender, drawn as 100% stacked bars.
# The title is now a single-line string; it previously contained an embedded
# line break between "by" and "Gender".
BarChart(
  MaritalStatus,
  by = Gender,
  stack100 = TRUE,
  main = "Marital Status by Gender",
  data = CSD,
  fill = c("pink3", "blue3")
)
## >>> Suggestions
## Plot(MaritalStatus, Gender) # bubble plot
## BarChart(MaritalStatus, by=Gender, horiz=TRUE) # horizontal bar chart
## BarChart(MaritalStatus, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## MaritalStatus
## Gender Divorced Married Single Sum
## Female 61 177 116 354
## Male 130 233 153 516
## Sum 191 410 269 870
##
##
## Cramer's V: 0.094
##
## Chi-square Test: Chisq = 7.769, df = 2, p-value = 0.021
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## MaritalStatus
## Gender Divorced Married Single
## Female 0.319 0.432 0.431
## Male 0.681 0.568 0.569
## Sum 1.000 1.000 1.000
# Overall distribution of OverTime (1 = Yes, 0 = No)
BarChart(OverTime, data = CSD)
## >>> Suggestions
## BarChart(OverTime, horiz=TRUE) # horizontal bar chart
## BarChart(OverTime, fill="reds") # red bars of varying lightness
## PieChart(OverTime) # doughnut (ring) chart
## Plot(OverTime) # bubble plot
## Plot(OverTime, stat="count") # lollipop plot
##
##
## --- OverTime ---
##
##
## Missing Values of OverTime: 0
##
##
## 0 1 Total
## Frequencies: 618 252 870
## Proportions: 0.710 0.290 1.000
##
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 153.972, df = 1, p-value = 0.000
# Education field counts, split by Attrition through the `by1` argument
BarChart(EducationField, data = CSD, by1=Attrition)
## [Trellis graphics from Deepayan Sarkar's lattice package]
##
## EducationField Count
## Human Resources 0 11
## Human Resources 1 4
## Human Resources Sum 15
## Life Sciences 0 305
## Life Sciences 1 53
## Life Sciences Sum 358
## Marketing 0 80
## Marketing 1 20
## Marketing Sum 100
## Medical 0 233
## Medical 1 37
## Medical Sum 270
## Other 0 43
## Other 1 9
## Other Sum 52
## Technical Degree 0 58
## Technical Degree 1 17
## Technical Degree Sum 75
## Sum 0 730
## Sum 1 140
## Sum Sum 870
##
##
## Cramer's V: 0.086
##
## Chi-square Test: Chisq = 6.411, df = 5, p-value = 0.268
## >>> Low cell expected frequencies, chi-squared approximation may not be accurate
#BarChart(Attrition, data = CSD, by1=BusinessTravel)
# Attrition by Business Travel, drawn as 100% stacked bars
BarChart(
  Attrition,
  by = BusinessTravel,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by Business Travel"
)
## >>> Suggestions
## Plot(Attrition, BusinessTravel) # bubble plot
## BarChart(Attrition, by=BusinessTravel, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## BusinessTravel 0 1 Sum
## Non-Travel 83 11 94
## Travel_Frequently 123 35 158
## Travel_Rarely 524 94 618
## Sum 730 140 870
##
##
## Cramer's V: 0.083
##
## Chi-square Test: Chisq = 5.994, df = 2, p-value = 0.050
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## BusinessTravel 0 1
## Non-Travel 0.114 0.079
## Travel_Frequently 0.168 0.250
## Travel_Rarely 0.718 0.671
## Sum 1.000 1.000
# More attrition among employees whose marital status is Single
# (same Attrition-by-MaritalStatus chart as earlier, with a different palette)
BarChart(
  Attrition,
  by = MaritalStatus,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by Marital Status",
  fill = c("turquoise", "blue3", "forestgreen")
)
## >>> Suggestions
## Plot(Attrition, MaritalStatus) # bubble plot
## BarChart(Attrition, by=MaritalStatus, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## MaritalStatus 0 1 Sum
## Divorced 179 12 191
## Married 352 58 410
## Single 199 70 269
## Sum 730 140 870
##
##
## Cramer's V: 0.199
##
## Chi-square Test: Chisq = 34.406, df = 2, p-value = 0.000
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## MaritalStatus 0 1
## Divorced 0.245 0.086
## Married 0.482 0.414
## Single 0.273 0.500
## Sum 1.000 1.000
# Attrition by Gender, drawn as 100% stacked bars
BarChart(
  Attrition,
  by = Gender,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by Gender",
  fill = c("pink3", "blue3")
)
## >>> Suggestions
## Plot(Attrition, Gender) # bubble plot
## BarChart(Attrition, by=Gender, horiz=TRUE) # horizontal bar chart
## BarChart(Attrition, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## Attrition
## Gender 0 1 Sum
## Female 301 53 354
## Male 429 87 516
## Sum 730 140 870
##
##
## Cramer's V (phi): 0.025
##
## Chi-square Test: Chisq = 0.555, df = 1, p-value = 0.456
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## Attrition
## Gender 0 1
## Female 0.412 0.379
## Male 0.588 0.621
## Sum 1.000 1.000
# Attrition by Job Satisfaction - lower job satisfaction = increased attrition
# (JobSatisfaction is the charted variable, split by Attrition)
BarChart(
  JobSatisfaction,
  by = Attrition,
  data = CSD,
  stack100 = TRUE,
  outer = TRUE,
  main = "Attrition by Job Satisfaction"
)
## >>> Suggestions
## Plot(JobSatisfaction, Attrition) # bubble plot
## BarChart(JobSatisfaction, by=Attrition, horiz=TRUE) # horizontal bar chart
## BarChart(JobSatisfaction, fill="steelblue") # steelblue bars
##
##
## Joint and Marginal Frequencies
## ------------------------------
##
## JobSatisfaction
## Attrition 1 2 3 4 Sum
## 0 141 135 211 243 730
## 1 38 31 43 28 140
## Sum 179 166 254 271 870
##
##
## Cramer's V: 0.113
##
## Chi-square Test: Chisq = 11.109, df = 3, p-value = 0.011
##
##
## Cell Proportions within Each Column
## -----------------------------------
##
## JobSatisfaction
## Attrition 1 2 3 4
## 0 0.788 0.813 0.831 0.897
## 1 0.212 0.187 0.169 0.103
## Sum 1.000 1.000 1.000 1.000
# Share of all employees (in percent) in each Attrition x Gender combination
CSD %>%
  group_by(Attrition, Gender) %>%
  summarise(Attrition_Percent = n() / nrow(CSD) * 100)
## `summarise()` has grouped output by 'Attrition'. You can override using the `.groups` argument.
## # A tibble: 4 × 3
## # Groups: Attrition [2]
## Attrition Gender Attrition_Percent
## <dbl> <fct> <dbl>
## 1 0 Female 34.6
## 2 0 Male 49.3
## 3 1 Female 6.09
## 4 1 Male 10
# ** Only 16% of the employees in our data actually left their position for one reason or another **
# Note to self: This class imbalance may cause problems in the modeling phase. Consider sampling ideas.
# JobRole trends: share of all employees in each job role (Fig. 1)
CSD %>% # Viz
  ggplot(aes(JobRole, fill = JobRole)) +
  # after_stat() replaces the deprecated ..count.. notation
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels = percent) +
  labs(title = "Job Role Percentages", y = "Percentage", x = "Job Role", tag = "Fig. 1") +
  theme_igray() +
  # theme() has to come after the complete theme; placed before it, the
  # legend.position setting was overwritten by theme_igray().
  theme(legend.position = "none")
# Income by job title: one box plot per role, roles ordered by -MonthlyIncome
CSD %>%
  ggplot(
    aes(
      reorder(x = JobRole, -MonthlyIncome),
      y = MonthlyIncome,
      fill = JobRole
    )
  ) +
  geom_boxplot() +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    title = "Income by Role",
    x = "Job Title",
    y = "Income",
    tag = "Fig. 3"
  ) +
  theme_economist_white() +
  theme(
    legend.position = "none",
    axis.title.y = element_text(vjust = 4)
  ) +
  guides(x = guide_axis(n.dodge = 2)) # stagger the role labels over two rows
# 16 percent left their role: share of employees (in percent) per Attrition value
CSD %>%
  group_by(Attrition) %>%
  summarise(Attrition_Percent = n() / nrow(CSD) * 100)
## # A tibble: 2 × 2
## Attrition Attrition_Percent
## <dbl> <dbl>
## 1 0 83.9
## 2 1 16.1
# Attrition by Job Role: share of all employees per role, stacked by attrition.
# Attrition is numeric 0/1 here, so it is wrapped in factor(); a numeric fill
# does not split the bars into groups.
CSD %>%
  ggplot(aes(JobRole, y = after_stat(count / sum(count)), fill = factor(Attrition))) +
  geom_bar() +
  theme_economist_white() +
  theme(axis.title.x = element_text(vjust = -4)) +
  labs(title = "Attrition By Job Role", x = "Job Role", y = "", tag = "Fig. 2", fill = "Attrition") +
  scale_y_continuous(labels = percent)
# Sales Reps have high attrition, despite having a Job Satisfaction rating in the upper percentile of our data. Monthly Income
# Job Satisfaction by Gender: employee counts per gender, one panel per
# satisfaction level. geom_bar() counts rows directly; it replaces
# geom_histogram(stat = "count"), which is a histogram geom forced to count.
CSD %>%
  ggplot(aes(Gender)) +
  geom_bar() +
  facet_grid(~JobSatisfaction) +
  labs(title = "Job Satisfaction by Gender", y = "", tag = "Fig. 3") +
  theme_economist_white() +
  theme(axis.text.y = element_blank())
# Women seem to be not satisfied with their work situation
# NOTE(review): the bars are raw counts and the data holds 516 men vs 354
# women, so compare within-gender proportions before relying on this.
# Income by Gender & Job Role: mean monthly income per gender, one panel per
# role. The previous geom_bar(stat = "identity") stacked every employee's
# income, so bar height grew with headcount rather than with pay.
CSD %>%
  ggplot(aes(Gender, MonthlyIncome)) +
  stat_summary(fun = mean, geom = "bar") +
  facet_wrap(~JobRole) +
  scale_y_continuous(labels = dollar_format()) +
  labs(title = "Income by Gender & Job Role", tag = "Fig.4", y = "Mean Monthly Income") +
  theme_economist_white() +
  theme(axis.title.y = element_text(vjust = 2))
# Women are paid less than men
# NOTE(review): this conclusion was drawn from the summed-income version of
# the chart; re-check it against the mean-based bars.
# Education Levels by Job Role: one panel per role, bars dodged by education level
# NOTE(review): with stat = "identity" the bar height is the Education value
# itself, not a count of employees -- confirm this is the intended chart.
CSD %>%
  ggplot(
    aes(
      Gender,
      Education,
      fill = factor(Education)
    )
  ) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~JobRole) +
  theme_economist_white() +
  theme(axis.title.y = element_text(vjust = 2)) +
  labs(
    title = "Education Levels by Job Role",
    y = "Education Level",
    tag = "Fig.3"
  )
# Identify the top 3 factors that lead to attrition
#### We will now start looking for the correlations in the data between variables.
#### Correlations between variables
# Continuous variables
plot_correlation(
  CSD,
  type = "continuous",
  geom_text_args = list(),
  theme_config = list(
    legend.position = "right",
    axis.text.x = element_text(angle = 90)
  )
)
# Discrete variables
plot_correlation(
  CSD,
  type = "discrete",
  geom_text_args = list(),
  theme_config = list(
    legend.position = "right",
    axis.text.x = element_text(angle = 90)
  )
)
#1 OverTime
#2 Marital_Status-Single
#3 Job Role Sales Representative
# Identify job-role-specific trends.
# Bar chart of JobRole conditioned on JobSatisfaction (by1).
# NOTE(review): Attrition is not part of this call, so the chart on its own
# does not show whether lower job satisfaction goes with higher attrition.
BarChart(
  JobRole,
  by1 = JobSatisfaction,
  data = CSD
)
## [Trellis graphics from Deepayan Sarkar's lattice package]
## Table output is vertical to fit in window, but > 30 rows
## To view the complete table, save the output
## to an object, e.g., b <- BarChart(...)
## then b$freq
##
##
## Cramer's V: 0.100
##
## Chi-square Test: Chisq = 26.048, df = 24, p-value = 0.351
KNN - Overview of Process
#Output should be "Case2PredictionsNelson-EaddyAttrition.csv" – add to GitHub repo
#install.packages("caTools")
library(caTools)
# Classify Attrition into two classes:
# 1. True
# 2. False
# Feature columns for the kNN model, selected by position.
# NOTE(review): going by the head() printouts later in this file, positions
# 6, 20 and 22 are DistanceFromHome, OverTime and PerformanceRating - confirm
# CSD has the same column order at this point.
attrition.subset <- CSD[, c(6, 20, 22)]
# Earlier conversion attempts, kept for reference:
#CSD[,attrition.subset$Attrition] <-lapply(CSD[,attrition.subset$Attrition] , factor) #convert chr to factor
#CSD$Attrition<-ifelse(CSD$Attrition=="Yes",1,0)
#CSD$Attrition<-as.numeric(as.character(CSD$Attrition))
#Normalization
#' Min-max rescale a numeric vector onto [0, 1].
#'
#' @param x A numeric vector.
#' @return A numeric vector the same length as `x`, computed as
#'   (x - min(x)) / (max(x) - min(x)). A vector whose values are all equal
#'   is returned as all zeros rather than NaN.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # A constant vector has zero range; dividing by it would give 0 / 0 = NaN.
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}
# Rescale every feature column with normalize().
attrition.subset.n <- as.data.frame(lapply(attrition.subset, normalize))
# Data splitting
set.seed(5)
# Random selection of 70% of the row indices for training.
attr.d <- sample(
  seq_len(nrow(attrition.subset.n)),
  size = nrow(attrition.subset.n) * 0.7,
  replace = FALSE
)
# Train/test features are taken from the normalized frame; the raw
# attrition.subset was used here before, leaving attrition.subset.n unused.
train.attrition <- attrition.subset.n[attr.d, ] # 70% training data
test.attrition <- attrition.subset.n[-attr.d, ] # remaining 30% test data
# Separate vectors for the 'Attrition' target.
# attrition.subset holds only CSD columns 6, 20 and 22 (no Attrition column),
# so the labels are read from CSD$Attrition. Column 2 of the subset, which is
# one of the predictors, was used as the target here before.
# NOTE: accuracy figures and confusion matrices recorded further down in this
# file were produced before this fix and need to be regenerated.
train.attrition_labels <- CSD$Attrition[attr.d]
test.attrition_labels <- CSD$Attrition[-attr.d]
# Number of training observations.
NROW(train.attrition_labels)
## [1] 609
# Square root of the training size, computed from the data instead of a
# hard-coded 609, used as the starting k (rounded to 25).
sqrt(NROW(train.attrition_labels)) #25
## [1] 24.67793
# kNN predictions for the test rows at k = 25 and k = 11.
knn.25 <- knn(
  train = train.attrition,
  test = test.attrition,
  cl = train.attrition_labels,
  k = 25
)
knn.11 <- knn(
  train = train.attrition,
  test = test.attrition,
  cl = train.attrition_labels,
  k = 11
)
# Model evaluation
# Percentage of test rows classified correctly for k = 25 and k = 11.
ACC.25 <- 100 * sum(knn.25 == test.attrition_labels) / NROW(test.attrition_labels)
ACC.25
## [1] 83.14176
ACC.11 <- 100 * sum(knn.11 == test.attrition_labels) / NROW(test.attrition_labels)
ACC.11
## [1] 93.86973
# Confusion matrix and summary statistics for the k = 25 predictions.
confusionMatrix(
  table(knn.25, test.attrition_labels)
)
## Confusion Matrix and Statistics
##
## test.attrition_labels
## knn.25 0 1
## 0 189 41
## 1 3 28
##
## Accuracy : 0.8314
## 95% CI : (0.7804, 0.8748)
## No Information Rate : 0.7356
## P-Value [Acc > NIR] : 0.0001668
##
## Kappa : 0.4737
##
## Mcnemar's Test P-Value : 0.00000002434
##
## Sensitivity : 0.9844
## Specificity : 0.4058
## Pos Pred Value : 0.8217
## Neg Pred Value : 0.9032
## Prevalence : 0.7356
## Detection Rate : 0.7241
## Detection Prevalence : 0.8812
## Balanced Accuracy : 0.6951
##
## 'Positive' Class : 0
##
# Confusion matrix and summary statistics for the k = 11 predictions.
confusionMatrix(
  table(knn.11, test.attrition_labels)
)
## Confusion Matrix and Statistics
##
## test.attrition_labels
## knn.11 0 1
## 0 192 16
## 1 0 53
##
## Accuracy : 0.9387
## 95% CI : (0.9024, 0.9646)
## No Information Rate : 0.7356
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.8297
##
## Mcnemar's Test P-Value : 0.0001768
##
## Sensitivity : 1.0000
## Specificity : 0.7681
## Pos Pred Value : 0.9231
## Neg Pred Value : 1.0000
## Prevalence : 0.7356
## Detection Rate : 0.7356
## Detection Prevalence : 0.7969
## Balanced Accuracy : 0.8841
##
## 'Positive' Class : 0
##
# Sweep k = 1..30, storing the test-set accuracy (percent) for each k in
# k.optm and printing "k = accuracy" as the loop runs.
k.optm <- numeric(30)
for (i in seq_along(k.optm)) {
  knn.mod <- knn(
    train = train.attrition,
    test = test.attrition,
    cl = train.attrition_labels,
    k = i
  )
  k.optm[i] <- 100 * sum(test.attrition_labels == knn.mod) / NROW(test.attrition_labels)
  k <- i
  cat(k, "=", k.optm[i], "")
}
## 1 = 98.46743 2 = 98.46743 3 = 96.93487 4 = 98.08429 5 = 98.46743 6 = 97.70115 7 = 95.4023 8 = 94.25287 9 = 94.25287 10 = 93.86973 11 = 93.86973 12 = 93.86973 13 = 93.86973 14 = 93.10345 15 = 92.33716 16 = 91.57088 17 = 89.27203 18 = 90.03831 19 = 88.12261 20 = 86.59004 21 = 86.59004 22 = 86.2069 23 = 82.37548 24 = 82.37548 25 = 83.90805 26 = 83.14176 27 = 82.75862 28 = 81.99234 29 = 83.90805 30 = 83.14176
# Switch the working directory to the case-study folder.
# NOTE(review): this is an absolute, machine-specific path; the relative
# write.csv() call further down writes into whatever directory is set here.
setwd("/Users/lnelson/Box/MSDS/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/")
#' Root mean squared error of a vector of residuals.
#'
#' @param error Numeric vector of errors (e.g. actual minus predicted).
#' @return The square root of the mean of the squared errors.
RMSE <- function(error) {
  squared <- error^2
  sqrt(mean(squared))
}
# Fraction of rows used to train the monthly-income model.
Incomesplit <- .80
# Response (MonthlyIncome) plus the predictors kept for the linear model.
SM <- CSD %>%
  select(JobLevel, TotalWorkingYears, MonthlyIncome, Age, JobRole)
# Model Creation
# NOTE(review): no set.seed() precedes this sample(), so the split depends on
# the RNG state left by earlier code.
Trainindicieslm <- sample(seq_len(nrow(SM)), round(Incomesplit * nrow(SM)))
LRMtr <- SM[Trainindicieslm, ]
LRMte <- SM[-Trainindicieslm, ]
fit <- lm(MonthlyIncome ~ ., data = LRMtr)
# Model Test RMSE < 3000
# Use the predict() generic so dispatch picks the method for the fitted model.
prediction <- predict(fit, newdata = Salary)
# Hold-out RMSE via the RMSE() helper defined in this file (takes residuals).
RMSE(LRMte$MonthlyIncome - predict(fit, LRMte)) #Sanity Check
## [1] 1011.848
# 935.93
summary(prediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2074 2614 5778 6219 6427 19782
predictiondf <- as.data.frame(prediction)
Salary$Salary <- predictiondf[, 1]
# Keep only the ID and predicted salary, ordered by ID.
CSPRED <- Salary %>%
  select(ID, Salary) %>%
  arrange(ID)
# row.names = FALSE keeps the file to the two columns ID and Salary; the
# default would prepend an unnamed column of row numbers.
write.csv(CSPRED, file = "Case2Predictions_Nelson_Eaddy Salary.csv", row.names = FALSE)
# Naive Bayes model
# Keep only rows with no missing value in the three predictors or in Attrition.
NBCSD <- filter(
  CSD,
  !is.na(DistanceFromHome),
  !is.na(OverTime),
  !is.na(PerformanceRating),
  !is.na(Attrition)
)
set.seed(9)
# 70% of the rows, drawn at random, form the training set; the rest are test.
trainIndices1 <- sample(seq_len(nrow(NBCSD)), round(.7 * nrow(NBCSD)))
train1 <- NBCSD[trainIndices1, ]
test1 <- NBCSD[-trainIndices1, ]
head(train1, n = 6L)
## Age Attrition BusinessTravel DailyRate Department
## 187 25 0 Travel_Frequently 772 Research & Development
## 565 30 0 Travel_Rarely 921 Research & Development
## 262 29 1 Travel_Rarely 224 Research & Development
## 827 31 1 Travel_Rarely 1079 Sales
## 408 36 0 Travel_Rarely 917 Research & Development
## 595 40 0 Travel_Rarely 884 Research & Development
## DistanceFromHome Education EducationField EnvironmentSatisfaction Gender
## 187 2 1 Life Sciences 4 Male
## 565 1 3 Life Sciences 4 Male
## 262 1 4 Technical Degree 1 Male
## 827 16 4 Marketing 1 Male
## 408 6 4 Life Sciences 3 Male
## 595 15 3 Life Sciences 1 Female
## HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction
## 187 77 4 2 Manufacturing Director 3
## 565 38 1 1 Laboratory Technician 3
## 262 100 2 1 Research Scientist 1
## 827 70 3 3 Sales Executive 3
## 408 60 1 1 Laboratory Technician 3
## 595 80 2 3 Manufacturing Director 3
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked OverTime
## 187 Divorced 5206 4973 1 0
## 565 Married 3833 24375 3 0
## 262 Single 2362 7568 6 0
## 827 Married 8161 19002 2 0
## 408 Divorced 2741 6865 1 0
## 595 Married 10435 25800 1 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 187 17 3 3
## 565 21 4 3
## 262 13 3 3
## 827 13 3 1
## 408 14 3 3
## 595 13 3 4
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 187 2 7 6 3
## 565 2 7 2 3
## 262 0 11 2 1
## 827 3 10 2 3
## 408 1 7 4 3
## 595 2 18 2 3
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 187 7 7 0
## 565 2 2 0
## 262 9 7 0
## 827 1 0 0
## 408 7 7 1
## 595 18 15 14
## YearsWithCurrManager
## 187 7
## 565 2
## 262 7
## 827 0
## 408 7
## 595 12
# First six rows of the naive Bayes test split.
head(test1, n = 6L)
## Age Attrition BusinessTravel DailyRate Department
## 2 40 0 Travel_Rarely 1308 Research & Development
## 4 32 0 Travel_Rarely 801 Sales
## 5 24 0 Travel_Frequently 567 Research & Development
## 6 27 0 Travel_Frequently 294 Research & Development
## 7 41 0 Travel_Rarely 1283 Research & Development
## 8 37 0 Travel_Rarely 309 Sales
## DistanceFromHome Education EducationField EnvironmentSatisfaction Gender
## 2 14 3 Medical 3 Male
## 4 1 4 Marketing 3 Female
## 5 2 1 Technical Degree 1 Female
## 6 10 2 Life Sciences 4 Male
## 7 5 5 Medical 2 Male
## 8 10 4 Life Sciences 4 Female
## HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction
## 2 44 2 5 Research Director 3
## 4 48 3 3 Sales Executive 4
## 5 32 3 1 Research Scientist 4
## 6 32 3 3 Manufacturing Director 1
## 7 90 4 1 Research Scientist 3
## 8 88 2 2 Sales Executive 4
## MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked OverTime
## 2 Single 19626 17544 1 0
## 4 Married 10422 24032 1 0
## 5 Single 3760 17218 1 1
## 6 Divorced 8793 4809 1 0
## 7 Married 2127 5561 2 1
## 8 Divorced 6694 24223 2 1
## PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel
## 2 14 3 1 0
## 4 19 3 3 2
## 5 13 3 3 0
## 6 21 4 3 2
## 7 12 3 1 0
## 8 14 3 3 3
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 2 21 2 4 20
## 4 14 3 3 14
## 5 6 2 3 6
## 6 9 4 2 9
## 7 7 5 2 4
## 8 8 5 3 1
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 2 7 4 9
## 4 10 5 7
## 5 3 1 3
## 6 7 1 7
## 7 2 0 3
## 8 0 0 0
# Fit a naive Bayes classifier for Attrition on columns 6, 20 and 22 of the
# training split (DistanceFromHome, OverTime and PerformanceRating in the
# head() printouts above), with laplace = 1.
nb_cols <- c(6, 20, 22)
model1 <- naiveBayes(train1[, nb_cols], train1$Attrition, laplace = 1)
# Predict the test split once and reuse the result for both tables.
nb_pred <- predict(model1, test1[, nb_cols])
# deparse.level = 0 leaves the table dimensions unnamed.
table(nb_pred, test1$Attrition, deparse.level = 0)
##
## 0 1
## 0 216 36
## 1 7 2
CM5 <- confusionMatrix(table(nb_pred, test1$Attrition, deparse.level = 0))
CM5
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 216 36
## 1 7 2
##
## Accuracy : 0.8352
## 95% CI : (0.7846, 0.8781)
## No Information Rate : 0.8544
## P-Value [Acc > NIR] : 0.8333
##
## Kappa : 0.0311
##
## Mcnemar's Test P-Value : 0.00001955
##
## Sensitivity : 0.96861
## Specificity : 0.05263
## Pos Pred Value : 0.85714
## Neg Pred Value : 0.22222
## Prevalence : 0.85441
## Detection Rate : 0.82759
## Detection Prevalence : 0.96552
## Balanced Accuracy : 0.51062
##
## 'Positive' Class : 0
##
Create a full EDA report